{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# HTML解析实践\n",
    "\n",
    " "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 上周回顾及翻页思考\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [],
   "source": [
    "from requests_html import HTMLSession\n",
    "import requests_html\n",
    "import pandas as pd\n",
    "import urllib.parse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "200"
      ]
     },
     "execution_count": 145,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# A1  nfu.edu.cn \n",
    "session = HTMLSession()\n",
    "r = session.get(\"https://www.nfu.edu.cn/xxyw/index.htm\")\n",
    " \n",
    "r.status_code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {},
   "outputs": [],
   "source": [
    "## html 页面数据的存与读"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 存\n",
    "with open (\"html_out/_nfu_文学与传媒学院.html\", encoding = \"utf8\", mode = \"w\") as fp:\n",
    "    fp.write(r.html.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 读\n",
    "with open (\"html_out/_nfu_文学与传媒学院.html\", encoding = \"utf8\", mode = \"r\") as fp:\n",
    "    html_load = fp.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [],
   "source": [
    "## soup_html 解析 ： str的html文件 => element html元素文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Element html at 0x18271c21a90>"
      ]
     },
     "execution_count": 150,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 解析\n",
    "parsed = requests_html.soup_parse(html_load)\n",
    "parsed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 解析和重塑链接（内容链接）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ParseResult(scheme='https', netloc='www.nfu.edu.cn', path='/xxyw/index.htm', params='', query='', fragment='')"
      ]
     },
     "execution_count": 152,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 解析\n",
    "base_url = r.url\n",
    "nfu_urlparse = urllib.parse.urlparse(base_url)\n",
    "nfu_urlparse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/xxyw/5b71d46d3b114859ae92f7535a7d60c9.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/f9bcd8092b494a04becfaf48b3138e20.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/48b0929919ec4d2d9a2cdc278fc884ea.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/0d7bd841484a42a69d241e79365b6290.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/debb2f222e024cbda5d2644acb6c552c.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/e5378134dbaf4b7b88d3003f1cd99e59.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/7c865b16b203467ab6ddf5569f73e5c1.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/28b0ad0eee8149e6b7f4ae65395910ff.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/c48c33c8f744430eb9417b800a8b2e3f.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/395b8e2ba5df47c59d080d50d1113be1.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/59bda093ced440f78c638ade40ab0b93.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/1af5590575b74762b624f048b5ad79f4.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/4e32521de0da4d21979182e1b114a964.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/23279088871e4b89b8eab2e7fbc77b17.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/a5de3999469447b488857144f58f8c27.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/6273fd9185b54b20a0af15b9878f1d2c.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/a1f9ac1d39704e4d8136478ec97e3635.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/c438a1ec6db5446faf76617654b5ca55.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/f28729353ff749b9b170825ffe346949.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/f25bba2bf25d43399a88598c769bb779.htm']"
      ]
     },
     "execution_count": 153,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 重组链接\n",
    "list_URL  = [urllib.parse.urlunparse\\\n",
    "([nfu_urlparse.scheme,nfu_urlparse.netloc,'/'+ nfu_urlparse.path.split('/')[1] +'/' + detail_url,'','',''])\\\n",
    "for detail_url in parsed.xpath('//div[@class=\"news_title\"]/a/@href')]\n",
    "list_URL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>标题</th>\n",
       "      <th>链结</th>\n",
       "      <th>日期</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>快！来为我校大学生国旗护卫队参赛点赞！</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/5b71d46d3b114859ae...</td>\n",
       "      <td>2021-04-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>专注当下，冲刺高考，奋斗出最美的青春</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/f9bcd8092b494a04be...</td>\n",
       "      <td>2021-04-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>我校召开2021年一流专业、一流课程、教学成果奖申报工作推进会</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/48b0929919ec4d2d9a...</td>\n",
       "      <td>2021-04-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>我校承办首届 “新时代从商培养工程”</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/0d7bd841484a42a69d...</td>\n",
       "      <td>2021-04-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>广东工业大学华立学院来访我校</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/debb2f222e024cbda5...</td>\n",
       "      <td>2021-03-31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>“疫情下的中国、美国以及中美关系”高层论坛暨广州南方学院“美国研究中心”成立五周年纪念研讨会</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/e5378134dbaf4b7b88...</td>\n",
       "      <td>2021-03-31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>“思想政治理论第一课”上，学校党委书记、校长讲了这些！</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/7c865b16b203467ab6...</td>\n",
       "      <td>2021-03-29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>学生报道|传承红色基因，讲好党员故事：我校举办身边优秀党员事迹分享会</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/28b0ad0eee8149e6b7...</td>\n",
       "      <td>2021-03-29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>权威发布：我校名列2021年中国民办高校第四</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/c48c33c8f744430eb9...</td>\n",
       "      <td>2021-03-26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>喜讯：电气学院学子在国际顶尖期刊发表学术论文</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/395b8e2ba5df47c59d...</td>\n",
       "      <td>2021-03-26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>我校召开行政人员专题培训暨2021年春季学期第一次办公室工作例会</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/59bda093ced440f78c...</td>\n",
       "      <td>2021-03-19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>我校开展2021年春季学期教学检查工作</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/1af5590575b74762b6...</td>\n",
       "      <td>2021-03-15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>媒体报道我校入选国家级一流本科专业建设点情况</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/4e32521de0da4d2197...</td>\n",
       "      <td>2021-03-15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>我校举行转设更名挂牌仪式</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/23279088871e4b89b8...</td>\n",
       "      <td>2021-03-12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>广东省教育厅公布国家级一流专业建设点数量排名：我校位列广东高校第29，同类院校第1</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/a5de3999469447b488...</td>\n",
       "      <td>2021-03-12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>我校政商研究院学生胡志翔在国际期刊发表论文</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/6273fd9185b54b20a0...</td>\n",
       "      <td>2021-03-12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>我校举行大一学生升旗仪式暨晨跑之星颁奖大会</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/a1f9ac1d39704e4d81...</td>\n",
       "      <td>2021-03-12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>我校召开党史学习教育动员大会</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/c438a1ec6db5446faf...</td>\n",
       "      <td>2021-03-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>我校组织参加全省教育系统党史学习教育动员部署会视频会议</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/f28729353ff749b9b1...</td>\n",
       "      <td>2021-03-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>喜讯：我校计算机科学与技术专业获IEET认证</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/f25bba2bf25d43399a...</td>\n",
       "      <td>2021-03-08</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                标题  \\\n",
       "0                              快！来为我校大学生国旗护卫队参赛点赞！   \n",
       "1                               专注当下，冲刺高考，奋斗出最美的青春   \n",
       "2                  我校召开2021年一流专业、一流课程、教学成果奖申报工作推进会   \n",
       "3                               我校承办首届 “新时代从商培养工程”   \n",
       "4                                   广东工业大学华立学院来访我校   \n",
       "5   “疫情下的中国、美国以及中美关系”高层论坛暨广州南方学院“美国研究中心”成立五周年纪念研讨会   \n",
       "6                      “思想政治理论第一课”上，学校党委书记、校长讲了这些！   \n",
       "7               学生报道|传承红色基因，讲好党员故事：我校举办身边优秀党员事迹分享会   \n",
       "8                           权威发布：我校名列2021年中国民办高校第四   \n",
       "9                           喜讯：电气学院学子在国际顶尖期刊发表学术论文   \n",
       "10                我校召开行政人员专题培训暨2021年春季学期第一次办公室工作例会   \n",
       "11                             我校开展2021年春季学期教学检查工作   \n",
       "12                          媒体报道我校入选国家级一流本科专业建设点情况   \n",
       "13                                    我校举行转设更名挂牌仪式   \n",
       "14       广东省教育厅公布国家级一流专业建设点数量排名：我校位列广东高校第29，同类院校第1   \n",
       "15                           我校政商研究院学生胡志翔在国际期刊发表论文   \n",
       "16                           我校举行大一学生升旗仪式暨晨跑之星颁奖大会   \n",
       "17                                  我校召开党史学习教育动员大会   \n",
       "18                     我校组织参加全省教育系统党史学习教育动员部署会视频会议   \n",
       "19                          喜讯：我校计算机科学与技术专业获IEET认证   \n",
       "\n",
       "                                                   链结          日期  \n",
       "0   https://www.nfu.edu.cn/xxyw/5b71d46d3b114859ae...  2021-04-09  \n",
       "1   https://www.nfu.edu.cn/xxyw/f9bcd8092b494a04be...  2021-04-02  \n",
       "2   https://www.nfu.edu.cn/xxyw/48b0929919ec4d2d9a...  2021-04-02  \n",
       "3   https://www.nfu.edu.cn/xxyw/0d7bd841484a42a69d...  2021-04-02  \n",
       "4   https://www.nfu.edu.cn/xxyw/debb2f222e024cbda5...  2021-03-31  \n",
       "5   https://www.nfu.edu.cn/xxyw/e5378134dbaf4b7b88...  2021-03-31  \n",
       "6   https://www.nfu.edu.cn/xxyw/7c865b16b203467ab6...  2021-03-29  \n",
       "7   https://www.nfu.edu.cn/xxyw/28b0ad0eee8149e6b7...  2021-03-29  \n",
       "8   https://www.nfu.edu.cn/xxyw/c48c33c8f744430eb9...  2021-03-26  \n",
       "9   https://www.nfu.edu.cn/xxyw/395b8e2ba5df47c59d...  2021-03-26  \n",
       "10  https://www.nfu.edu.cn/xxyw/59bda093ced440f78c...  2021-03-19  \n",
       "11  https://www.nfu.edu.cn/xxyw/1af5590575b74762b6...  2021-03-15  \n",
       "12  https://www.nfu.edu.cn/xxyw/4e32521de0da4d2197...  2021-03-15  \n",
       "13  https://www.nfu.edu.cn/xxyw/23279088871e4b89b8...  2021-03-12  \n",
       "14  https://www.nfu.edu.cn/xxyw/a5de3999469447b488...  2021-03-12  \n",
       "15  https://www.nfu.edu.cn/xxyw/6273fd9185b54b20a0...  2021-03-12  \n",
       "16  https://www.nfu.edu.cn/xxyw/a1f9ac1d39704e4d81...  2021-03-12  \n",
       "17  https://www.nfu.edu.cn/xxyw/c438a1ec6db5446faf...  2021-03-11  \n",
       "18  https://www.nfu.edu.cn/xxyw/f28729353ff749b9b1...  2021-03-09  \n",
       "19  https://www.nfu.edu.cn/xxyw/f25bba2bf25d43399a...  2021-03-08  "
      ]
     },
     "execution_count": 154,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 输出结果\n",
    "# B-D-1 pd.DataFrame 建构，pandas课有教\n",
    "df = pd.DataFrame( {\n",
    "         \"标题\": parsed.xpath('//div[@class=\"news_title\"]/a/@title'),\n",
    "         \"链结\": list_URL,\n",
    "         \"日期\": parsed.xpath('//font[@class=\"right-more\"]/text()'),\n",
    "     } )\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {},
   "outputs": [],
   "source": [
    "# B-D-2 pd.DataFrame 输出excel，pandas课有教\n",
    "df.to_excel(\"data_out/nfu_文学与传媒学院8.xlsx\", sheet_name=\"检索结果\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 本周内容1:如何实现翻页？\n",
    "\n",
    "* 1. 翻页链接有何区别？\n",
    "* 2. 有多少页？\n",
    "* 3. 实现翻页的url队列\n",
    "* 4. 批量存html文件\n",
    "* 5. 批量存excel文件"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 翻页链接有何区别？"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://www.nfu.edu.cn/xxyw/index.htm'"
      ]
     },
     "execution_count": 156,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 第一页\n",
    "base_url_01 = r.url\n",
    "base_url_01"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SplitResult(scheme='https', netloc='www.nfu.edu.cn', path='/xxyw/index.htm', query='', fragment='')"
      ]
     },
     "execution_count": 157,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "urllib.parse.urlsplit(base_url_01)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>第一页</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>www.nfu.edu.cn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>/xxyw/index.htm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               第一页\n",
       "0            https\n",
       "1   www.nfu.edu.cn\n",
       "2  /xxyw/index.htm\n",
       "3                 \n",
       "4                 "
      ]
     },
     "execution_count": 158,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(urllib.parse.urlsplit(base_url_01)).rename({0:\"第一页\"},axis=1)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://www.nfu.edu.cn/xxyw/index2.htm'"
      ]
     },
     "execution_count": 159,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 第二页\n",
    "base_url_02 = session.get('https://www.nfu.edu.cn/xxyw/index2.htm').url\n",
    "base_url_02"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>第一页</th>\n",
       "      <th>第二页</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https</td>\n",
       "      <td>https</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>www.nfu.edu.cn</td>\n",
       "      <td>www.nfu.edu.cn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>/xxyw/index.htm</td>\n",
       "      <td>/xxyw/index2.htm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               第一页               第二页\n",
       "0            https             https\n",
       "1   www.nfu.edu.cn    www.nfu.edu.cn\n",
       "2  /xxyw/index.htm  /xxyw/index2.htm\n",
       "3                                   \n",
       "4                                   "
      ]
     },
     "execution_count": 160,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['第二页'] = urllib.parse.urlsplit(base_url_02)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 有多少页？\n",
    "* 第三页...  第n页？多少页？"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "91\n"
     ]
    }
   ],
   "source": [
    "for i in range(1,100):\n",
    "    r = session.get('https://www.nfu.edu.cn/xxyw/index'+str(i)+'.htm')\n",
    "    if r.status_code != 200:\n",
    "        print(i)\n",
    "        break\n",
    "# so page = 19?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 实现翻页的url队列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/xxyw/index1.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index2.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index3.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index4.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index5.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index6.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index7.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index8.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index9.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index10.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index11.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index12.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index13.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index14.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index15.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index16.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index17.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index18.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index19.htm']"
      ]
     },
     "execution_count": 162,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "url_group = ['https://www.nfu.edu.cn/xxyw/index'+str(i)+'.htm' for i in range(1,20)]\n",
    "url_group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [],
   "source": [
    "url_group.insert(0,'https://www.nfu.edu.cn/xxyw/index.htm')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/xxyw/index.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index1.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index2.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index3.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index4.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index5.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index6.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index7.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index8.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index9.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index10.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index11.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index12.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index13.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index14.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index15.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index16.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index17.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index18.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index19.htm']"
      ]
     },
     "execution_count": 164,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "url_group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/xxyw/index.htm'"
      ]
     },
     "execution_count": 165,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "urllib.parse.urlparse(url_group[0]).path"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 批量存html文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {},
   "outputs": [
    {
     "ename": "FileNotFoundError",
     "evalue": "[Errno 2] No such file or directory: 'html_out//xxyw/index.htm'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-166-30e80ca1c450>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      4\u001b[0m \u001b[1;31m# 设置存档html的名称\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      5\u001b[0m     \u001b[0mpath\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0murllib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0murlparse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m     \u001b[1;32mwith\u001b[0m \u001b[0mopen\u001b[0m \u001b[1;33m(\u001b[0m\u001b[1;34m'html_out/'\u001b[0m\u001b[1;33m+\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mencoding\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"utf8\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmode\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"w\"\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mfp\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      7\u001b[0m         \u001b[0mfp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhtml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhtml\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      8\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'html_out//xxyw/index.htm'"
     ]
    }
   ],
   "source": [
    "for url in url_group:\n",
    "    r = session.get(url)\n",
    "#     print(r.html.html)\n",
    "# 设置存档html的名称\n",
    "    path = urllib.parse.urlparse(url).path\n",
    "    with open ('html_out/'+path, encoding = \"utf8\", mode = \"w\") as fp:\n",
    "        fp.write(r.html.html)\n",
    "    \n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 批量存excel文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# xpath 准备：\n",
    "dict_xpath = {\n",
    "    '链接_xpath':'//div[@class=\"news_title\"]/a/@href',\n",
    "    '标题_xpath':'//div[@class=\"news_title\"]/a/@title',\n",
    "    '日期_xpath':'//font[@class=\"right-more\"]/text()'\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 页面详细链接拼接准备\n",
    "def pages_content_url(parsed):\n",
    "    list_URL  = [urllib.parse.urlunparse\\\n",
    "                 ([nfu_urlparse.scheme,nfu_urlparse.netloc,'/'+ nfu_urlparse.path.split('/')[1] +'/' + detail_url,'','',''])\\\n",
    "                 for detail_url in parsed.xpath(dict_xpath['链接_xpath'])]\n",
    "    return list_URL\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 循环遍历出结果\n",
    "# os模块，取出某文件夹中的文件 组成列表\n",
    "import os\n",
    "\n",
    "list_df = []\n",
    "\n",
    "\n",
    "files= os.listdir('html_out/xxyw/')\n",
    "print(files)\n",
    "\n",
    "for html in files:\n",
    "    with open('html_out/xxyw/'+html,encoding='utf8',mode='r') as fp:\n",
    "        html_load = fp.read()\n",
    "        parsed = requests_html.soup_parse(html_load)\n",
    "        #准备链接，学校官网才用\n",
    "        list_URL = pages_content_url(parsed)\n",
    "        \n",
    "        df = pd.DataFrame( {\n",
    "         \"标题\": parsed.xpath(dict_xpath['标题_xpath']),\n",
    "         \"链结\": list_URL,\n",
    "         \"日期\": parsed.xpath(dict_xpath['日期_xpath']),\n",
    "        } )\n",
    "        list_df.append(df)\n",
    "\n",
    "        \n",
    "# 拼接表格 pandas 方法concat       \n",
    "df_all = pd.concat(list_df).reset_index().sort_values(by='日期',ascending=False)  #通过日期进行文章的排序，\n",
    "display(df_all)    \n",
    "\n",
    "with pd.ExcelWriter('data_out/nfu_官网.xxyw',mode='w',engine=\"openpyxl\") as writer:  \n",
    "            df_all.to_excel(writer, sheet_name='学校要闻')\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 课后实践，增加sheet，完成一下数据抓取\n",
    "\n",
    "* 学校要闻\n",
    "* 校园动态\n",
    "* 通知公告\n",
    "* 招投标\n",
    "* 高教动态\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 下周预习目标\n",
    "## 使用 xpath 应用 [m.liepin.com](https://m.liepin.com/zhaopin/)\n",
    "\n",
    "你是数据科学家，这m.liepin.com有什么样的牛肉，你想怎么抓？\n",
    "* 工作名称\n",
    "* 工作地点\n",
    "* 工作\n",
    "* ..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "336px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
